/* istanbul ignore file */
'use strict'
// @ts-check
/**
 * 用于重置分类器的键
 * keys we use to serialize a classifier's state
 * fork from https://github.com/ttezel/bayes
 */

const STATE_KEYS = ['categories', 'docCount', 'totalDocuments', 'vocabulary', 'wordCount', 'wordFrequencyCount', 'options']

/**
 * 默认分词器，英文按照空格分割单词，中文按照字符分割 可使用其他分词器代替
 * Given an input string, tokenize it into an array of word tokens.
 * This is the default tokenization function used if user does not provide one in `options`.
 *
 * @param  {String} text
 * @return {Array}
 */

const defaultTokenizer = text => {
  // 仅保留英文、中文、数字
  const rgxPunctuation = /[^(a-zA-ZA-Яa-я\u4e00-\u9fa50-9_)+\s]/g
  // 英文以空格分词，中文不分词，以单个字为单位
  return text
    .replace(rgxPunctuation, ' ')
    .replace(/[\u4e00-\u9fa5]/g, word => `${word} `)
    .split(/\s+/)
}

/**
 * Naive-Bayes Classifier 朴素贝叶斯
 *
 * This is a naive-bayes classifier that uses Laplace Smoothing.
 *
 */

class NaiveBayes {
  constructor (opt) {
    this.options = {}
    if (typeof opt !== 'undefined') {
      if (!opt || typeof opt !== 'object' || Array.isArray(opt)) {
        console.error('配置，需要是对象')
        return -1
      }
      this.options = opt
    }
    // 分词器
    this.tokenizer = this.options.tokenizer || defaultTokenizer
    // 词汇表
    this.vocabulary = []
    // 已学习的文档总数量, number of documents we have learned from
    this.totalDocuments = 0
    // 分类的词频表, document frequency table for each of our categories
    this.docCount = {}
    // 分类词总数/概率基数, for each category, how many words total were mapped to it
    this.wordCount = {}
    // 分类的词频统计, word frequency table for each category
    this.wordFrequencyCount = {}
    // 所有分类, hashmap of our category names
    this.categories = []
  }

  /**
   * 初始化新分类
   * Initialize each of our data structure entries for this new category
   *
   * @param  {String} categoryName
   */

  initializeCategory (categoryName) {
    if (!this.categories.includes(categoryName)) {
      this.docCount[categoryName] = 0
      this.wordCount[categoryName] = 0
      this.wordFrequencyCount[categoryName] = {}
      this.categories.push(categoryName)
    }
    return this
  }

  /**
   * 训练朴素贝叶斯分类器，告诉它分类关系
   * train our naive-bayes classifier by telling it what `category`
   * the `text` corresponds to.
   *
   * @param  {String} text
   * @param  {String} class
   */

  learn (text, category) {
    // 初始化分类, initialize category data structures if we've never seen this category
    this.initializeCategory(category)
    // 更新这个分类映射的语句的数量（用于计算后面的 P(C) ）
    // update our count of how many documents mapped to this category
    this.docCount[category]++
    // 更新已学习的文档总数, update the total number of documents we have learned from
    this.totalDocuments++
    // 将文本标准化为词汇数组, normalize the text into a word array
    const tokens = this.tokenizer(text)
    // 获取文本中每个词汇的词频（用于更新总词频）, get a frequency count for each token in the text
    const frequencyTable = this.frequencyTable(tokens)

    /*
     * 更新我们的词汇和我们的词频计数这个分类
     * Update our vocabulary and our word frequency count for this category
     */

    Object.keys(frequencyTable).forEach(token => {
      // 将目标词汇添加到词汇表, add this word to our vocabulary if not already existing
      if (!this.vocabulary.includes(token)) {
        this.vocabulary.push(token)
      }
      const frequencyInText = frequencyTable[token]
      // 在这个分类中更新这个词的频率信息（更新总词频）, update the frequency information for this word in this category
      if (!this.wordFrequencyCount[category][token]) {
        this.wordFrequencyCount[category][token] = frequencyInText
      } else {
        this.wordFrequencyCount[category][token] += frequencyInText
      }
      // 更新我们已经看到映射到这个分类的所有词汇的计数（C.wordCount，用于计算词类概率）
      // update the count of all words we have seen mapped to this category
      this.wordCount[category] += frequencyInText
    })
    return this
  }

  /**
   * 进行分类，或者说进行预测
   * Determine what category `text` belongs to.
   *
   * @param  {String} text
   * @param  {Boolean} probability
   * @return {String} category
   */

  categorize (text, probability) {
    return probability ? this.probabilities(text)[0] : this.probabilities(text)[0].category
  }

  /**
   * 返回一个数组，数组内部是按照概率从高到低排序的组合
   * Determine category probabilities for `text`.
   *
   * @param  {String} text
   * @return {Array} probabilities
   */

  probabilities (text) {
    // [W1,W2,W3,W4,Wn...]
    const tokens = this.tokenizer(text)
    const frequencyTable = this.frequencyTable(tokens)
    // 返回由 P(W1|C) * P(W2|C) ... P(Wn|C) * P(C) 组成的数组
    // iterate thru our categories to calculate the probability for this text
    return this.categories
      .map(category => {
        // start by calculating the overall probability of this category
        // => out of all documents we've ever looked at, how many were
        //    mapped to this category
        const categoryProbability = this.docCount[category] / this.totalDocuments
        // take the log to avoid underflow
        let logProbability = Math.log(categoryProbability)
        // now determine P( w | c ) for each word `w` in the text
        Object.keys(frequencyTable).forEach(token => {
          const frequencyInText = frequencyTable[token]
          const tokenProbability = this.tokenProbability(token, category)
          // console.log('token: %s category: `%s` tokenProbability: %d', token, category, tokenProbability)
          // determine the log of the P( w | c ) for this word
          logProbability += frequencyInText * Math.log(tokenProbability)
        })
        return {
          category: category,
          probability: logProbability
        }
      })
      .sort((prev, next) => next.probability - prev.probability)
  }

  /**
   * 概率计算器，用于计算"元素"属于"分类"的概率
   * Calculate probability that a `token` belongs to a `category`
   *
   * @param  {String} token
   * @param  {String} category
   * @return {Number} probability
   */

  tokenProbability (token, category) {
    // 分类中目标词汇的词频
    const wordFrequencyCount = this.wordFrequencyCount[category][token] || 0
    // 分类总词汇数量
    const wordCount = this.wordCount[category]
    // 拉普拉斯方程，防止概率为0，P(W|C)
    return (wordFrequencyCount + 1) / (wordCount + this.vocabulary.length)
  }

  /**
   * 概率HashMap
   * Build a frequency hashmap where
   * - the keys are the entries in `tokens`
   * - the values are the frequency of each entry in `tokens`
   *
   * @param  {Array} tokens  Normalized word array
   * @return {Object}
   */

  frequencyTable (tokens) {
    const frequencyTable = Object.create(null)
    tokens.forEach(token => {
      if (!frequencyTable[token]) {
        frequencyTable[token] = 1
      } else {
        frequencyTable[token]++
      }
    })
    return frequencyTable
  }

  /**
   * Dump the classifier's state as a JSON string.
   * @param {Boolean} Optionally format the serialized JSON output for easier human consumption
   * @return {String} Representation of the classifier.
   */

  toJson () {
    return JSON.stringify(this.toJsonObject(), null, 2)
  }

  toJsonObject () {
    const me = this
    const state = {}
    STATE_KEYS.forEach(function (key) {
      state[key] = me[key]
    })
    return state
  }

  /**
   * 从JSON初始化贝叶斯分类器实例（json对象，不是字符串对象）
   * Initializes a NaiveBayes instance from a JSON state representation.
   * Use this with classifier.toJson().
   *
   * @param  {String} jsonStr   state representation obtained by classifier.toJson()
   * @return {NaiveBayes}       Classifier
   */

  static fromJson (o) {
    try {
      o = JSON.parse(o)
    } catch (e) {
      console.error('JSON格式不对')
      return -1
    }
    o.options = o.options || {}
    // init a new classifier
    const classifier = new NaiveBayes(o.options)
    // override the classifier's state
    STATE_KEYS.forEach(key => {
      if (o[key] === void 0) {
        console.error(`朴素贝叶斯.fromJson: JSON 属性丢失: '${key}'.`)
      } else {
        classifier[key] = o[key]
      }
    })
    return classifier
  }
}
module.exports = NaiveBayes
